##################################################################
##################################################################
##################################################################
#
#		A systematic approach to study electoral fraud
#
#		Lucas Leemann and Daniel Bochsler
#
##################################################################
##################################################################
##################################################################

# v4.0  -- 07/14/2015
rm(list=ls())
library(foreign)
setwd("/Users/lleemann/Dropbox/benford/AAA replication data")


##################################################################
# Define function to cary out Benford test
chi2.digitest <- function(dig, alpha=0.05, vec.th=rep(0.1,10)){
  Nsample <- length(dig)
  vec.emp <- rep(0,10)
  for (i in 1:10){
  	vec.emp[i] <- length(which(dig==(i-1)))
  		}  
  vec.th <- vec.th*Nsample
  test.vec <- vec.emp-vec.th
  test.vec2 <- test.vec^2
  test.vec3 <- test.vec2/vec.th
  outi <- sum(test.vec3)
  pval <- 1-pchisq(outi,9)
  w <- 1+1
  cat(" Test statistic is",round(outi,2),"and follows a chi2 distribution with 9 df","\n","The p-value for H_0 is",round(pval,3),"(alpha=",alpha,")","\n","(H_0: Digits follow a mixed Benford distribution)")
  }
  
chi2.digitest22 <- function(dig, alpha=0.05, vec.th=rep(0.1,10)){
  Nsample <- length(dig)
  vec.emp <- rep(0,10)
  for (i in 1:10){
  	vec.emp[i] <- length(which(dig==(i-1)))
  	}
  vec.th <- vec.th*Nsample
  test.vec <- vec.emp-vec.th
  test.vec2 <- test.vec^2
  test.vec3 <- test.vec2/vec.th
  outi <- sum(test.vec3)
  pval <- 1-pchisq(outi,9)
  return(pval)
  }


##################################################################
# Define Benford distribution
benford1 <- c(0, 0.30103, 0.17609, 0.12494, 0.09691, 0.07918, 0.06695, 0.05799, 0.05115, 0.04576)
benford2 <- c(0.11968,0.11389,0.10882,0.10433,0.10031,0.09668,0.09337,0.09035,0.08757,0.085)
benford3 <- c(0.10178, 0.10138, 0.10097, 0.10057, 0.10018, 0.09979, 0.09940, 0.09902, 0.09864, 0.09827)
benford4 <- c(0.10018, 0.10014, 0.10010, 0.10006, 0.10002, 0.09998, 0.09994, 0.09990, 0.09986,0.09982 )
benford5 <- rep(.1,10)
benford6 <- rep(.1,10)



##################################################################
# Results for Table 5
##################################################################

# Yes votes
test.data1 <- read.dta("YESvotes.dta")
test.data10 <- test.data1[test.data1$lostballots==0,]
test.data11 <- test.data1[test.data1$lostballots==1,]

# No votes
test.data <- read.dta("NOvotes.dta")
test.data0 <- test.data[test.data$lostballots==0,]
test.data1 <- test.data[test.data$lostballots==1,]


### Parliament Bill 
# YES
data.gr0 <- cbind(test.data10$lostballots, test.data10$yes_engr, test.data10$zif1_gr, test.data10$zif2_gr, test.data10$zif3_gr, test.data10$zif4_gr, test.data10$zif5_gr, test.data10$digitnr_gr)
zaehl0 <- rep(0, 5)
zaehl0[1] <- length(which(test.data10$digitnr_gr==1))
zaehl0[2] <- length(which(test.data10$digitnr_gr==2))
zaehl0[3] <- length(which(test.data10$digitnr_gr==3))
zaehl0[4] <- length(which(test.data10$digitnr_gr==4))
zaehl0[5] <- length(which(test.data10$digitnr_gr==5))
theor.ver.gr0 <- zaehl0[1]/sum(zaehl0)*benford1 + zaehl0[2]/sum(zaehl0)*benford2 + zaehl0[3]/sum(zaehl0)*benford3 + zaehl0[4]/sum(zaehl0)*benford4 + zaehl0[5]/sum(zaehl0)*benford5
lastdit.gr0 <- round(test.data10$zif1_gr)
chi2.digitest(lastdit.gr0, 0.05, vec.th=theor.ver.gr0)
# NO
data.gr0 <- cbind(test.data0$lostballots, test.data0$yes_engr, test.data0$zif1_gr, test.data0$zif2_gr, test.data0$zif3_gr, test.data0$zif4_gr, test.data0$zif5_gr, test.data0$digitnr_gr)
zaehl0 <- rep(0, 5)
zaehl0[1] <- length(which(test.data0$digitnr_gr==1))
zaehl0[2] <- length(which(test.data0$digitnr_gr==2))
zaehl0[3] <- length(which(test.data0$digitnr_gr==3))
zaehl0[4] <- length(which(test.data0$digitnr_gr==4))
zaehl0[5] <- length(which(test.data0$digitnr_gr==5))
theor.ver.gr0 <- zaehl0[1]/sum(zaehl0)*benford1 + zaehl0[2]/sum(zaehl0)*benford2 + zaehl0[3]/sum(zaehl0)*benford3 + zaehl0[4]/sum(zaehl0)*benford4 + zaehl0[5]/sum(zaehl0)*benford5
lastdit.gr0 <- round(test.data0$zif1_gr)
chi2.digitest(lastdit.gr0, 0.05, vec.th=theor.ver.gr0)


### People's Amendment
# YES
data.vv0 <- cbind(test.data10$lostballots, test.data10$yes_envv, test.data10$zif1_vv, test.data10$zif2_vv, test.data10$zif3_vv, test.data10$zif4_vv, test.data10$zif5_vv, test.data10$digitnr_vv)
zaehl0 <- rep(0, 5)
zaehl0[1] <- length(which(test.data10$digitnr_vv==1))
zaehl0[2] <- length(which(test.data10$digitnr_vv==2))
zaehl0[3] <- length(which(test.data10$digitnr_vv==3))
zaehl0[4] <- length(which(test.data10$digitnr_vv==4))
zaehl0[5] <- length(which(test.data10$digitnr_vv==5))
theor.ver.vv0 <- zaehl0[1]/sum(zaehl0)*benford1 + zaehl0[2]/sum(zaehl0)*benford2 + zaehl0[3]/sum(zaehl0)*benford3 + zaehl0[4]/sum(zaehl0)*benford4 + zaehl0[5]/sum(zaehl0)*benford5
lastdit.vv0 <- round(test.data10$zif1_vv)
chi2.digitest(lastdit.vv0, 0.05, vec.th=theor.ver.vv0) 
# NO
data.vv0 <- cbind(test.data0$lostballots, test.data0$yes_envv, test.data0$zif1_vv, test.data0$zif2_vv, test.data0$zif3_vv, test.data0$zif4_vv, test.data0$zif5_vv, test.data0$digitnr_vv)
zaehl0 <- rep(0, 5)
zaehl0[1] <- length(which(test.data0$digitnr_vv==1))
zaehl0[2] <- length(which(test.data0$digitnr_vv==2))
zaehl0[3] <- length(which(test.data0$digitnr_vv==3))
zaehl0[4] <- length(which(test.data0$digitnr_vv==4))
zaehl0[5] <- length(which(test.data0$digitnr_vv==5))
theor.ver.vv0 <- zaehl0[1]/sum(zaehl0)*benford1 + zaehl0[2]/sum(zaehl0)*benford2 + zaehl0[3]/sum(zaehl0)*benford3 + zaehl0[4]/sum(zaehl0)*benford4 + zaehl0[5]/sum(zaehl0)*benford5
lastdit.vv0 <- round(test.data0$zif1_vv)
chi2.digitest(lastdit.vv0, 0.05, vec.th=theor.ver.vv0)



### Tie-Break Question
# PB
data.st10 <- cbind(test.data10$lostballots, test.data10$yes_enst, test.data10$zif1_st, test.data10$zif2_st, test.data10$zif3_st, test.data10$zif4_st, test.data10$zif5_st, test.data10$digitnr_st)
zaehl0 <- rep(0, 5)
zaehl0[1] <- length(which(test.data10$digitnr_st==1))
zaehl0[2] <- length(which(test.data10$digitnr_st==2))
zaehl0[3] <- length(which(test.data10$digitnr_st==3))
zaehl0[4] <- length(which(test.data10$digitnr_st==4))
zaehl0[5] <- length(which(test.data10$digitnr_st==5))
theor.ver.st0 <- zaehl0[1]/sum(zaehl0)*benford1 + zaehl0[2]/sum(zaehl0)*benford2 + zaehl0[3]/sum(zaehl0)*benford3 + zaehl0[4]/sum(zaehl0)*benford4 + zaehl0[5]/sum(zaehl0)*benford5
lastdit.st0 <- round(test.data10$zif1_st)
chi2.digitest(lastdit.st0, 0.05, vec.th=theor.ver.st0)
# P's A
data.st0 <- cbind(test.data0$lostballots, test.data0$yes_enst, test.data0$zif1_st, test.data0$zif2_st, test.data0$zif3_st, test.data0$zif4_st, test.data0$zif5_st, test.data0$digitnr_st)
zaehl0 <- rep(0, 5)
zaehl0[1] <- length(which(test.data0$digitnr_st==1))
zaehl0[2] <- length(which(test.data0$digitnr_st==2))
zaehl0[3] <- length(which(test.data0$digitnr_st==3))
zaehl0[4] <- length(which(test.data0$digitnr_st==4))
zaehl0[5] <- length(which(test.data0$digitnr_st==5))
theor.ver.st0 <- zaehl0[1]/sum(zaehl0)*benford1 + zaehl0[2]/sum(zaehl0)*benford2 + zaehl0[3]/sum(zaehl0)*benford3 + zaehl0[4]/sum(zaehl0)*benford4 + zaehl0[5]/sum(zaehl0)*benford5
lastdit.st0 <- round(test.data0$zif1_st)
chi2.digitest(lastdit.st0, 0.05, vec.th=theor.ver.st0)


##################################################################
# Results for Table 6
##################################################################


### Parliament Bill
# YES
data.gr1 <- cbind(test.data11$lostballots, test.data11$yes_engr, test.data11$zif1_gr, test.data11$zif2_gr, test.data11$zif3_gr, test.data11$zif4_gr, test.data11$zif5_gr, test.data11$digitnr_gr)
zaehl1 <- rep(1, 5)
zaehl1[1] <- length(which(test.data11$digitnr_gr==1))
zaehl1[2] <- length(which(test.data11$digitnr_gr==2))
zaehl1[3] <- length(which(test.data11$digitnr_gr==3))
zaehl1[4] <- length(which(test.data11$digitnr_gr==4))
zaehl1[5] <- length(which(test.data11$digitnr_gr==5))
theor.ver.gr1 <- zaehl1[1]/sum(zaehl1)*benford1 + zaehl1[2]/sum(zaehl1)*benford2 + zaehl1[3]/sum(zaehl1)*benford3 + zaehl1[4]/sum(zaehl1)*benford4 + zaehl1[5]/sum(zaehl1)*benford5
lastdit.gr1 <- round(test.data11$zif1_gr)
chi2.digitest(lastdit.gr1, 0.05, vec.th=theor.ver.gr1)
# NO
data.gr1 <- cbind(test.data1$lostballots, test.data1$yes_engr, test.data1$zif1_gr, test.data1$zif2_gr, test.data1$zif3_gr, test.data1$zif4_gr, test.data1$zif5_gr, test.data1$digitnr_gr)
zaehl1 <- rep(1, 5)
zaehl1[1] <- length(which(test.data1$digitnr_gr==1))
zaehl1[2] <- length(which(test.data1$digitnr_gr==2))
zaehl1[3] <- length(which(test.data1$digitnr_gr==3))
zaehl1[4] <- length(which(test.data1$digitnr_gr==4))
zaehl1[5] <- length(which(test.data1$digitnr_gr==5))
theor.ver.gr1 <- zaehl1[1]/sum(zaehl1)*benford1 + zaehl1[2]/sum(zaehl1)*benford2 + zaehl1[3]/sum(zaehl1)*benford3 + zaehl1[4]/sum(zaehl1)*benford4 + zaehl1[5]/sum(zaehl1)*benford5
lastdit.gr1 <- round(test.data1$zif1_gr)
chi2.digitest(lastdit.gr1, 0.05, vec.th=theor.ver.gr1)


### People's Amendment
# YES
data.vv1 <- cbind(test.data11$lostballots, test.data11$yes_envv, test.data11$zif1_vv, test.data11$zif2_vv, test.data11$zif3_vv, test.data11$zif4_vv, test.data11$zif5_vv, test.data11$digitnr_vv)
zaehl1 <- rep(1, 5)
zaehl1[1] <- length(which(test.data11$digitnr_vv==1))
zaehl1[2] <- length(which(test.data11$digitnr_vv==2))
zaehl1[3] <- length(which(test.data11$digitnr_vv==3))
zaehl1[4] <- length(which(test.data11$digitnr_vv==4))
zaehl1[5] <- length(which(test.data11$digitnr_vv==5))
theor.ver.vv1 <- zaehl1[1]/sum(zaehl1)*benford1 + zaehl1[2]/sum(zaehl1)*benford2 + zaehl1[3]/sum(zaehl1)*benford3 + zaehl1[4]/sum(zaehl1)*benford4 + zaehl1[5]/sum(zaehl1)*benford5
lastdit.vv1 <- round(test.data11$zif1_vv)
chi2.digitest(lastdit.vv1, 0.05, vec.th=theor.ver.vv1)
# NO
data.vv1 <- cbind(test.data1$lostballots, test.data1$yes_envv, test.data1$zif1_vv, test.data1$zif2_vv, test.data1$zif3_vv, test.data1$zif4_vv, test.data1$zif5_vv, test.data1$digitnr_vv)
zaehl1 <- rep(1, 5)
zaehl1[1] <- length(which(test.data1$digitnr_vv==1))
zaehl1[2] <- length(which(test.data1$digitnr_vv==2))
zaehl1[3] <- length(which(test.data1$digitnr_vv==3))
zaehl1[4] <- length(which(test.data1$digitnr_vv==4))
zaehl1[5] <- length(which(test.data1$digitnr_vv==5))
theor.ver.vv1 <- zaehl1[1]/sum(zaehl1)*benford1 + zaehl1[2]/sum(zaehl1)*benford2 + zaehl1[3]/sum(zaehl1)*benford3 + zaehl1[4]/sum(zaehl1)*benford4 + zaehl1[5]/sum(zaehl1)*benford5
lastdit.vv1 <- round(test.data1$zif1_vv)
chi2.digitest(lastdit.vv1, 0.05, vec.th=theor.ver.vv1)


### Tie-Break Question
# YES
data.st1 <- cbind(test.data11$lostballots, test.data11$yes_enst, test.data11$zif1_st, test.data11$zif2_st, test.data11$zif3_st, test.data11$zif4_st, test.data11$zif5_st, test.data11$digitnr_st)
zaehl1 <- rep(1, 5)
zaehl1[1] <- length(which(test.data11$digitnr_st==1))
zaehl1[2] <- length(which(test.data11$digitnr_st==2))
zaehl1[3] <- length(which(test.data11$digitnr_st==3))
zaehl1[4] <- length(which(test.data11$digitnr_st==4))
zaehl1[5] <- length(which(test.data11$digitnr_st==5))
theor.ver.st1 <- zaehl1[1]/sum(zaehl1)*benford1 + zaehl1[2]/sum(zaehl1)*benford2 + zaehl1[3]/sum(zaehl1)*benford3 + zaehl1[4]/sum(zaehl1)*benford4 + zaehl1[5]/sum(zaehl1)*benford5
lastdit.st1 <- round(test.data11$zif1_st)
chi2.digitest(lastdit.st1, 0.05, vec.th=theor.ver.st1)
# NO
data.st1 <- cbind(test.data1$lostballots, test.data1$yes_enst, test.data1$zif1_st, test.data1$zif2_st, test.data1$zif3_st, test.data1$zif4_st, test.data1$zif5_st, test.data1$digitnr_st)
zaehl1 <- rep(1, 5)
zaehl1[1] <- length(which(test.data1$digitnr_st==1))
zaehl1[2] <- length(which(test.data1$digitnr_st==2))
zaehl1[3] <- length(which(test.data1$digitnr_st==3))
zaehl1[4] <- length(which(test.data1$digitnr_st==4))
zaehl1[5] <- length(which(test.data1$digitnr_st==5))
theor.ver.st1 <- zaehl1[1]/sum(zaehl1)*benford1 + zaehl1[2]/sum(zaehl1)*benford2 + zaehl1[3]/sum(zaehl1)*benford3 + zaehl1[4]/sum(zaehl1)*benford4 + zaehl1[5]/sum(zaehl1)*benford5
lastdit.st1 <- round(test.data1$zif1_st)
chi2.digitest(lastdit.st1, 0.05, vec.th=theor.ver.st1)





##################################################################
# Appendix
##################################################################

# 
KS.testB <- function(x, w){
		y.th <- rep(0,w[1]) 
		for (t in 2:10){
			r <- rep(t-1,w[t])
			y.th <- c(y.th,r)
		}
		D <- as.numeric(suppressWarnings(ks.test(x,y.th)$statistic))
	# Stephens (1970)
	sqrt.N <- sqrt(length(x))
	stephens.D <- D*(sqrt.N+0.12+0.11/sqrt.N)	
	# Morrow (2010)
	if(stephens.D<1.148) pass <- 1
	if(stephens.D>1.148) pass <- 0
	return(pass)
	}

set.seed(12345)
m <- 1000		# number of simulations
n <- 13			# number of sample sizes
K <- 100		# number of runs to get uncertainty


how.m <- function(x) {length(which(x<0.05))}
pos.testB <- matrix(NA,K,n)

sampleL <- c(10,15,20,25,30,35,40,45,50,55,60,65,70)

for (k in 1:K){
		storL <- matrix(NA, m,n)
		for (j in 1:n){
			for (i in 1:m){
					N <- sampleL[j]
					x <- c(rep(0,150),rep(1,150),rep(2,150),rep(3,450),rep(4,150),rep(5,150),rep(6,150),rep(7,450),rep(8,150),rep(9,150))
					dig <- sample(x,N)
					storL[i,j] <- chi2.digitest22(dig, 0.05)
				}
		}
	pos.testB[k,] <- apply(storL,2,how.m)
	}

ssamp <- rep(sampleL, each=100)

par(mai=c(1,1,1,1), mfrow=c(1,2))
plot(ssamp,pos.testB/1000,col=rgb(0,0,150,50,maxColorValue=255), pch=19, ylim=c(0,0.999), ylab=expression("Share of Rejected H"[0]), xlab="Sample Size", bty="n", cex=0.3)
points(sampleL,colMeans(pos.testB)/1000, pch=19, cex=1.4,col=rgb(0,0,150,100,maxColorValue=255))


M <- 1000		# number of simulations
J <- 13			# number of sample sizes

K <- 100
pos.test <- matrix(NA,K,n)
w.val <- rep(100,10)

sampleL <- c(10,15,20,25,30,35,40,45,50,55,60,65,70)
x <- c(rep(0,150),rep(1,150),rep(2,150),rep(3,450),rep(4,150),rep(5,150),rep(6,150),rep(7,450),rep(8,150),rep(9,150))

# K-S
for (k in 1:K){
		storL <- matrix(NA, M,J)
		for (j in 1:J){
			for (i in 1:M){
					N <- sampleL[j]
					dig <- sample(x,N)
					storL[i,j] <- KS.testB(dig, w.val)
				}
		}
	print(k)
	pos.test[k,] <- 	colMeans(storL)
}

ssamp <- rep(sampleL, each=K)
points(ssamp, 1-pos.test, col=rgb(150,0,0,50,maxColorValue=255), pch=19, cex=0.3)
points(sampleL, colMeans(1-pos.test), col=rgb(150,0,0,100,maxColorValue=255), pch=19, cex=1.4)

legend(10,0.9,legend=c("K-S Test", expression(paste("Pearson's ", chi^2, " Test"))),col=c(rgb(0,0,150,200,maxColorValue=255),rgb(150,0,0,200,maxColorValue=255)),pch=19, bty="n", cex=1.5)



plot(1,1,col=rgb(1,1,1,0,maxColorValue=255), ylim=c(0,0.999), xlim=c(8,71))
rect(9,0,15,0.15,col=rgb(50,50,50,20,maxColorValue=255),lwd=0.1)
rect(15,0,21,0.15,col=rgb(50,50,50,20,maxColorValue=255),lwd=0.1)
rect(21,0,27,0.15,col=rgb(50,50,50,20,maxColorValue=255),lwd=0.1)
rect(27,0,33,0.45,col=rgb(50,50,50,20,maxColorValue=255),lwd=0.1)
rect(33,0,39,0.15,col=rgb(50,50,50,20,maxColorValue=255),lwd=0.1)
rect(39,0,45,0.15,col=rgb(50,50,50,20,maxColorValue=255),lwd=0.1)
rect(45,0,51,0.15,col=rgb(50,50,50,20,maxColorValue=255),lwd=0.1)
rect(51,0,57,0.45,col=rgb(50,50,50,20,maxColorValue=255),lwd=0.1)
rect(57,0,63,0.15,col=rgb(50,50,50,20,maxColorValue=255),lwd=0.1)
rect(63,0,69,0.15,col=rgb(50,50,50,20,maxColorValue=255),lwd=0.1)
text(12,0.12,"'0'",col=rgb(50,50,50,100,maxColorValue=255))
text(18,0.12,"'1'",col=rgb(50,50,50,100,maxColorValue=255))
text(24,0.12,"'2'",col=rgb(50,50,50,100,maxColorValue=255))
text(30,0.35,"'3'",col=rgb(50,50,50,100,maxColorValue=255))
text(36,0.12,"'4'",col=rgb(50,50,50,100,maxColorValue=255))
text(42,0.12,"'5'",col=rgb(50,50,50,100,maxColorValue=255))
text(48,0.12,"'6'",col=rgb(50,50,50,100,maxColorValue=255))
text(54,0.35,"'7'",col=rgb(50,50,50,100,maxColorValue=255))
text(60,0.12,"'8'",col=rgb(50,50,50,100,maxColorValue=255))
text(66,0.12,"'9'",col=rgb(50,50,50,100,maxColorValue=255))


############################################################



